library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)
library(GEOquery) ; library(limma)
Dataset downloded from two different places:
# LOAD DATA
# Expression data downliaded from GEMMA (https://gemma.msl.ubc.ca/expressionExperiment/showExpressionExperiment.html?id=11805)
datExpr = read.delim('./../Data/inputData/11805_GSE102741_expmat.data.txt.gz', comment.char='#')
datGenes_original = datExpr %>% dplyr::select(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid) %>%
dplyr::rename('entrezgene' = NCBIid, 'hgnc_symbol' = GeneSymbol) %>%
mutate(entrezgene = entrezgene %>% as.character %>% as.integer)
datExpr = datExpr %>% dplyr::select(-c(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid))
colnames(datExpr) = sapply(colnames(datExpr), function(x) strsplit(x, '\\.')[[1]][3]) %>% unname
# Metadata downloaded from GEO
GEO_data = getGEO('GSE102741', destdir='./../Data/inputData')[[1]]
datMeta = GEO_data@phenoData@data %>%
mutate(Diagnosis = factor(ifelse(grepl('control', characteristics_ch1), 'CTL', 'ASD'),
levels = c('CTL','ASD')),
Age = substring(characteristics_ch1.4, 6) %>% as.numeric %>% round,
Sex = `Sex:ch1`,
Sample_ID = description,
Ethnicity = substring(characteristics_ch1.6, 7),
title = gsub(' ', '', title)) %>%
dplyr::rename('Subject_ID' = title) %>%
dplyr::select(Subject_ID, geo_accession, Sample_ID, Diagnosis, Age, Sex, Ethnicity)
datMeta = datMeta[match(colnames(datExpr), datMeta$Subject_ID),]
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
Dataset consists of 52 samples (13 ASD and 39 Controls), all extracted from the DLPFC of the brain
Sequenced using Illumina’s HiSeq 2000 (Gupta used the same, Gandal used Illumina HiSeq 2500, they are compatible)
The dataset includes 25981 genes from 52 samples belonging to 52 different subjects
Counts distribution: The data has already been preprocessed, so we have relatively balanced values, centered close to 0
counts = datExpr %>% melt
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
max(counts$value)))
count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
| Statistic | Values |
|---|---|
| Min | -5.77 |
| 1st Quartile | -1.97 |
| Median | 1.06 |
| Mean | 0.89 |
| 3rd Quartile | 3.81 |
| Max | 16.47 |
counts %>% ggplot(aes(value)) + geom_density(color = '#993399', fill = '#993399', alpha = 0.3) + theme_minimal()
rm(counts, count_distr)
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels
Annotate genes with Biotype labels:
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys (17 genes return multiple labels)
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4))
########################################################################################
# 1. Query archive version
getinfo = c('entrezgene','ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes = getBM(attributes = getinfo, filters=c('entrezgene'), values = datGenes_original$entrezgene, mart=mart) %>%
right_join(datGenes_original %>% dplyr::select(entrezgene, hgnc_symbol), by = 'entrezgene')
datGenes$length = datGenes$end_position - datGenes$start_position
# ! There were some multiple matches between entrezgene and ensemblID
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 7504/29332 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 12176/29332 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart,
values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/',
sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9704/12176 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes=getinfo, filters=c('hgnc_symbol'), mart=mart,
values=missing_genes)
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 6490/8998 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 14 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'),
values = missing_ensembl_ids, mart=mart)
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1/170 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),1))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank())
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Remove repeated entries by entrezgene
# Remove ensembl IDS with 'LRG' notation
datGenes = datGenes %>% filter(!grepl('LRG', ensembl_gene_id) & !is.na(entrezgene)) %>%
distinct(entrezgene, .keep_all = TRUE)
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(datGenes_original$entrezgene, datGenes$entrezgene),]
rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
The neuronal function is obtained from the Gene Ontology annotations for each gene, defining each gene that contains the substring ‘neuron’ as having a neuronal function. The pipeline to process and clean each gene’s GO annotations can be found in ./../Data/inputData/genes_GO_annotations.csv
GO_annotations = read.csv('./../Data/inputData/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuro', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>% mutate('Neuronal'=1)
rm(GO_annotations)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length
Considering all genes, this dataset contains 786 of the 912 SFARI genes
1.- Remove rows that don’t correspond to genes
This dataset doesn’t have non-gene rows, but it does have missing gene_biotype information
to_keep = !is.na(datGenes$gene_biotype)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
Removed 6335 ‘genes’, 19646 remainin
2. Keep only protein coding genes
81% of the genes are protein coding genes
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
kable_styling(full_width = F)
| . | Freq |
|---|---|
| protein_coding | 15977 |
| 1 | 1454 |
| lncRNA | 1330 |
| 3 | 261 |
| transcribed_unprocessed_pseudogene | 194 |
| processed_pseudogene | 122 |
| unprocessed_pseudogene | 79 |
| 6 | 47 |
| transcribed_processed_pseudogene | 35 |
| transcribed_unitary_pseudogene | 30 |
| lincRNA | 22 |
| snoRNA | 15 |
| antisense | 14 |
| pseudogene | 12 |
| processed_transcript | 9 |
| 7 | 8 |
| TEC | 8 |
| polymorphic_pseudogene | 6 |
| misc_RNA | 5 |
| rRNA | 5 |
| snRNA | 5 |
| unitary_pseudogene | 3 |
| miRNA | 1 |
| scaRNA | 1 |
| sense_intronic | 1 |
| TR_C_gene | 1 |
| translated_unprocessed_pseudogene | 1 |
Most of the non-protein coding genes have very low levels of expression
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(MeanExpr, fill=ProteinCoding, color=ProteinCoding)) +
geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
Filtering protein coding genes, we are left with 783 SFARI Genes (we lose 3 genes)
Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>%
dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost Genes') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000187951 | ARHGAP11B | 3 | 3 | 0 | 2 |
| ENSG00000187951 | ARHGAP11B | 3 | lncRNA | 0 | 2 |
| ENSG00000224639 | C4B | 3 | lncRNA | 0 | 6 |
| ENSG00000233067 | PTCHD1-AS | 2 | lncRNA | 0 | 3 |
rm(df)
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$entrezgene
rownames(datGenes) = datGenes$entrezgene
Removed 3669 genes. 15977 remaining
3. Remove genes with low levels of expression
This seems to have already been done in the original preprocessing of the data, so I won’t do it again
4. Remove outlier samples
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
\(\qquad\) - Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples
\(\qquad\) - All the outlier samples belong to the ASD group. Apart from this they don’t seem to have any characterstic in common (different subjects, extraction batches, brain lobes, age, PMI), except for Sex (but this is probably just because of the sex bias in the dataset)
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID,
'Subject_ID'=datMeta$Subject_ID,
'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
'Diagnosis'=datMeta$Diagnosis, 'Ethnicity'=datMeta$Ethnicity)
selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])
Outlier samples: sample25, sample3, sample31
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
rm(absadj, netsummary, ku, z.ku, plot_data)
Removed 3 samples, 49 remaining
rm(to_keep)
5. Remove repeated genes
There are no repeated genes in this dataset, so instead, I’m going to replace this step with:
5. Remove genes without or repeated ensembl ID
There are 194 genes without ensembl ID and 450
to_keep = !is.na(datGenes$ensembl_gene_id) & !duplicated(datGenes$ensembl_gene_id)
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rm(to_keep)
After filtering, the dataset consists of 15526 genes and 49 samples
save(datExpr, datMeta, datGenes, file='./../Data/preprocessedData/filtered_raw_data.RData')
#load('./../Data/preprocessedData/filtered_raw_data.RData')
Note: No batch correction is performed in this section, this is done after the normalisation step
There are no knowns sources of batch effects in this dataset
Using the sva function from limma following this tutorial
mod = model.matrix(~Diagnosis, data = datMeta)
sva_fit = sva(datExpr %>% as.matrix, mod)
## Number of significant surrogate variables is: 11
## Iteration (out of 5 ):1 2 3 4 5
#mod = model.matrix(~Diagnosis, data = datMeta)
#mod0 = model.matrix(~1, data = datMeta)
#sva_fit = svaseq(exp(datExpr) %>% as.matrix, mod = mod, mod0 = mod0)
rm(mod)
Found 11 surrogate variables
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))
datMeta = cbind(datMeta, sv_data)
rm(sv_data, sva_fit)
This data has already been normalised and transformed, so we are only going to perform the differential expression analysis.
Since our data has already been preprocessed, we cannot use DESeq2 to perform the differential expression analysis. instead I’m going to use limma following the procedure in RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR
fit = lmFit(datExpr, design=model.matrix(~SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + SV10 + SV11 +
Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_info = topTable(efit, sort.by = 'none', n = Inf, coef = 'DiagnosisASD')
DE_info$shrunken_log2FoldChange = predFCm(efit, coef = 'DiagnosisASD')
rm(fit, efit)
The original LFC has a positive relation with the level of expression of the genes, which is inverted by the shrunken LFC (Perhaps this is a consequence of the inverted relation between mean and SD in this dataset?)
NOTE: The author of limma does not recommend to perform log fold change shrinkage in the DE step, claiming it is already over-done at the normalisation step and when it isn’t, it should be done in other stages of the processing pipeline, not here, so I’m not sure which of the LFC estimations is more reliable in this case. I’ll keep both (https://support.bioconductor.org/p/100804/)
p1 = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>%
left_join(DE_info %>% data.frame %>% mutate(ID = rownames(.), significant = adj.P.Val<0.05), by='ID') %>%
ggplot(aes(meanExpr, abs(logFC), color = significant)) +
geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.01, aes(color=significant)) +
xlab('Mean Expression') + ylab('Original LFC Magnitude') + scale_y_sqrt() +
theme_minimal() + theme(legend.position = 'none')
p2 = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>%
left_join(DE_info %>% data.frame %>% mutate(ID = rownames(.), significant = adj.P.Val<0.05), by='ID') %>%
ggplot(aes(meanExpr, abs(shrunken_log2FoldChange), color = significant)) + scale_y_sqrt() +
geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.2) + xlab('Mean Expression') +
ylab('Shrunken LFC Magnitude') + theme_minimal() + labs(color = 'DE')
ggarrange(p1,p2, nrow = 1, common.legend = TRUE, legend = 'bottom')
rm(p1, p2)
The data has a larger variance in genes with the lowest levels of expression than in the rest of the dataset
data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd)) %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
theme_minimal()
By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data
In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.
Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:
Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)
But caution should be exercised to avoid removing biological signal of interest
We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective
Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed
# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
X = cbind(mod, svs)
Hat = solve(t(X) %*% X) %*% t(X)
beta = (Hat %*% t(datExpr))
rm(Hat)
gc()
P = ncol(mod)
return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp
# Correct
mod = model.matrix(~ Diagnosis, datMeta)
svs = datMeta %>% dplyr::select(SV1:SV8) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp
rm(correctDatExpr)
Removing batch effects improves the separation between diagnosis groups, but not as much as with Gandal’s dataset
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
'PC2'=pca_samples_before$x[,2], 'corrected'=0),
data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
left_join(datMeta %>% mutate('ID'=datMeta$Subject_ID), by='ID')
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
geom_point(aes(frame=corrected, id=ID), alpha=0.75) +
xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
round(100*summary(pca_samples_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
round(100*summary(pca_samples_after)$importance[2,2],1))) +
ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)
It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)
*Plot is done with only 10% of the genes so it’s not that heavy
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))
pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) +
geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
round(100*summary(pca_genes_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
round(100*summary(pca_genes_after)$importance[2,2],1))) +
scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)
Everything looks good, so we’re keeping the corrected expression dataset
datExpr = datExpr_corrected
rm(datExpr_corrected)
There are no known sources of batch effects in this dataset
# Join original and shrunken LFC values
genes_info = DE_info %>% data.frame %>%
dplyr::rename('log2FoldChange' = logFC, 'padj' = adj.P.Val, 'baseMean' = AveExpr) %>%
mutate(significant=padj<0.05 & !is.na(padj)) %>%
add_column('ID'=rownames(datExpr), 'entrezgene'=datGenes$entrezgene, .before='baseMean') %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal))
save(datExpr, datMeta, datGenes, genes_info, file='./../Data/preprocessedData/preprocessed_data.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] limma_3.40.6 GEOquery_2.52.0
## [3] kableExtra_1.1.0 knitr_1.32
## [5] expss_0.10.2 dendextend_1.13.4
## [7] vsn_3.52.0 WGCNA_1.69
## [9] fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [11] sva_3.32.1 genefilter_1.66.0
## [13] mgcv_1.8-31 nlme_3.1-147
## [15] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [17] DelayedArray_0.10.0 BiocParallel_1.18.1
## [19] matrixStats_0.56.0 Biobase_2.44.0
## [21] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [23] IRanges_2.18.3 S4Vectors_0.22.1
## [25] BiocGenerics_0.30.0 biomaRt_2.40.5
## [27] ggpubr_0.2.5 magrittr_2.0.1
## [29] ggExtra_0.9 GGally_1.5.0
## [31] gridExtra_2.3 viridis_0.5.1
## [33] viridisLite_0.3.0 RColorBrewer_1.1-2
## [35] plotlyutils_0.0.0.9000 plotly_4.9.2
## [37] glue_1.4.2 reshape2_1.4.4
## [39] forcats_0.5.0 stringr_1.4.0
## [41] dplyr_1.0.1 purrr_0.3.4
## [43] readr_1.3.1 tidyr_1.1.0
## [45] tibble_3.0.3 ggplot2_3.3.2
## [47] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] crosstalk_1.1.0.1 digest_0.6.27 foreach_1.5.0
## [10] htmltools_0.5.1.1 GO.db_3.8.2 fansi_0.4.1
## [13] checkmate_2.0.0 memoise_1.1.0 doParallel_1.0.15
## [16] cluster_2.1.0 annotate_1.62.0 modelr_0.1.6
## [19] prettyunits_1.1.1 jpeg_0.1-8.1 colorspace_1.4-1
## [22] blob_1.2.1 rvest_0.3.5 haven_2.2.0
## [25] xfun_0.22 crayon_1.3.4 RCurl_1.98-1.2
## [28] jsonlite_1.7.2 impute_1.58.0 iterators_1.0.12
## [31] survival_3.2-7 gtable_0.3.0 zlibbioc_1.30.0
## [34] XVector_0.24.0 webshot_0.5.2 scales_1.1.1
## [37] DBI_1.1.0 miniUI_0.1.1.1 Rcpp_1.0.6
## [40] xtable_1.8-4 progress_1.2.2 htmlTable_1.13.3
## [43] foreign_0.8-76 bit_4.0.4 preprocessCore_1.46.0
## [46] Formula_1.2-3 htmlwidgets_1.5.1 httr_1.4.2
## [49] acepack_1.4.1 ellipsis_0.3.1 farver_2.0.3
## [52] pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.3
## [55] nnet_7.3-14 dbplyr_1.4.2 locfit_1.5-9.4
## [58] labeling_0.3 tidyselect_1.1.0 rlang_0.4.10
## [61] later_1.1.0.1 AnnotationDbi_1.46.1 munsell_0.5.0
## [64] cellranger_1.1.0 tools_3.6.3 cli_2.0.2
## [67] generics_0.0.2 RSQLite_2.2.0 broom_0.7.0
## [70] evaluate_0.14 fastmap_1.0.1 yaml_2.2.1
## [73] bit64_4.0.5 fs_1.5.0 mime_0.10
## [76] xml2_1.2.5 compiler_3.6.3 rstudioapi_0.11
## [79] curl_4.3 png_0.1-7 affyio_1.54.0
## [82] ggsignif_0.6.0 reprex_0.3.0 geneplotter_1.62.0
## [85] stringi_1.5.3 highr_0.8 lattice_0.20-41
## [88] Matrix_1.2-18 vctrs_0.3.2 pillar_1.4.6
## [91] lifecycle_0.2.0 BiocManager_1.30.10 cowplot_1.0.0
## [94] data.table_1.12.8 bitops_1.0-6 httpuv_1.5.5
## [97] affy_1.62.0 R6_2.5.0 latticeExtra_0.6-29
## [100] promises_1.2.0.1 codetools_0.2-16 assertthat_0.2.1
## [103] withr_2.2.0 GenomeInfoDbData_1.2.1 hms_0.5.3
## [106] grid_3.6.3 rpart_4.1-15 rmarkdown_2.7
## [109] shiny_1.4.0.2 lubridate_1.7.4 base64enc_0.1-3